Participants should leave this workshop with an ability to:
libraries
list.of.packages <- c("bigrquery","plotly","scales","RColorBrewer","data.table","dplyr","knitr","corrplot","Hmisc","stats")
new.packages<-list.of.packages[!(list.of.packages %in% installed.packages()[,"Package"])]
if(length(new.packages)) install.packages(new.packages)
if(length(new.packages)>0)
{install.packages(new.packages)}
library(bigrquery)
library(plotly)
library(scales)
library(RColorBrewer)
library(data.table)
library(dplyr)
library(knitr)
library(corrplot)
library(Hmisc)
library(stats)
project_HST = "hst-953-2019"The name of the databases are:
The bigrquery package makes it easy to work with data stored in Google BigQuery by allowing you to query BigQuery tables and retrieve metadata about your projects, datasets, tables, and jobs.
Nursecharting: information entered in a semi-structured form by the nurse https://eicu-crd.mit.edu/tutorials/nursecharting/
vitalAperiodic: provides invasive vital sign data which is interfaced into eCareManager at irregular intervals.
vitalPeriodic: data which is consistently interfaced from bedside vital signs monitors into eCareManager. Data are generally interfaced as 1 minute averages, and archived into the vitalPeriodic table as 5 minute median values.
pivoted_vital: The script pivoted-vital.sql includes a preprocessed, pivoted version of nurse charting table of vital signs: https://github.com/MIT-LCP/mimic-code/tree/master/concepts/pivot
lab: Laboratory tests that have have been mapped to a standard set of measurements. https://eicu-crd.mit.edu/tutorials/lab/
customLab: Standardized labs are included in the ‘lab’ table. Laboratory measurements that are not configured within the standard interface.
## patientunitstayid chartoffset entryoffset heartrate
## Min. :2504725 Min. : -12.0 Min. : -12.0 Min. : 89.0
## 1st Qu.:2504725 1st Qu.: 250.5 1st Qu.: 250.5 1st Qu.: 96.0
## Median :2504725 Median : 520.5 Median : 520.5 Median :103.5
## Mean :2504725 Mean : 583.6 Mean : 583.6 Mean :105.1
## 3rd Qu.:2504725 3rd Qu.: 854.2 3rd Qu.: 854.2 3rd Qu.:112.0
## Max. :2504725 Max. :1353.0 Max. :1353.0 Max. :126.0
## NA's :6
## respiratoryrate spo2 nibp_systolic nibp_diastolic
## Min. :18.0 Min. : 80.00 Min. :103.0 Min. :49.00
## 1st Qu.:27.5 1st Qu.: 88.00 1st Qu.:110.0 1st Qu.:60.00
## Median :37.0 Median : 89.50 Median :114.0 Median :63.00
## Mean :37.0 Mean : 89.41 Mean :118.2 Mean :64.86
## 3rd Qu.:46.5 3rd Qu.: 91.00 3rd Qu.:127.0 3rd Qu.:69.50
## Max. :56.0 Max. :100.00 Max. :148.0 Max. :85.00
## NA's :92 NA's :4 NA's :15 NA's :15
## nibp_mean temperature temperaturelocation ibp_systolic
## Min. : 66.00 Min. :36.70 Length:94 Min. : NA
## 1st Qu.: 74.00 1st Qu.:36.88 Class :character 1st Qu.: NA
## Median : 79.00 Median :37.40 Mode :character Median : NA
## Mean : 82.52 Mean :37.27 Mean :NaN
## 3rd Qu.: 87.50 3rd Qu.:37.70 3rd Qu.: NA
## Max. :118.00 Max. :37.80 Max. : NA
## NA's :15 NA's :78 NA's :94
## ibp_diastolic ibp_mean
## Min. : NA Min. : NA
## 1st Qu.: NA 1st Qu.: NA
## Median : NA Median : NA
## Mean :NaN Mean :NaN
## 3rd Qu.: NA 3rd Qu.: NA
## Max. : NA Max. : NA
## NA's :94 NA's :94
Adapt the previous R/SQL code uncommenting the chunk below so it extracts data for all patients limiting the results to 5000 rows.
bpdatatobeplotted<-df_vitals_eicu%>%
select(chartoffset,nibp_systolic,nibp_diastolic,nibp_mean,ibp_systolic,ibp_diastolic,ibp_mean)
x <- list(
title = "Time from ICU admission (hours)"
,titlefont = 'Ubuntu'
)
y <- list(
title = "Value (mmHg)"
,titlefont = 'Ubuntu'
)
bp <- plot_ly(data = bpdatatobeplotted,
x = ~chartoffset/60
,marker=list(size=12, opacity=0.6)
)%>%
add_trace( y = ~nibp_systolic,name='nibp_systolic') %>%
add_trace( y = ~nibp_diastolic,name='nibp_diastolic') %>%
add_trace(y = ~nibp_mean,name='nibp_mean') %>%
add_trace(y = ~ibp_systolic,name='ibp_systolic') %>%
add_trace(y = ~ibp_diastolic,name='ibp_diastolic') %>%
add_trace(y = ~ibp_mean,name='ibp_mean') %>%
layout(
title = "Non-Invasive Blood Pressure in eICU in the first 24h of ICU admission",
xaxis = x, yaxis = y
)
bpUncomment and modify the following R code so it shows only heart rate and spo2 data.
# exercise2<-df_vitals_eicu%>%
# select(chartoffset,nibp_systolic,nibp_diastolic,nibp_mean,ibp_systolic,ibp_diastolic,ibp_mean)
#
# x <- list(
# title = "Time from ICU admission (hours)"
# ,titlefont = 'Ubuntu'
# )
# y <- list(
# title = "Value (mmHg)"
# ,titlefont = 'Ubuntu'
# )
#
# bp <- plot_ly(data = bpdatatobeplotted,
# x = ~chartoffset/60
# ,marker=list(size=12, opacity=0.6)
# )%>%
# add_trace( y = ~nibp_systolic,name='nibp_systolic') %>%
# add_trace( y = ~nibp_diastolic,name='nibp_diastolic') %>%
# add_trace(y = ~nibp_mean,name='nibp_mean') %>%
# add_trace(y = ~ibp_systolic,name='ibp_systolic') %>%
# add_trace(y = ~ibp_diastolic,name='ibp_diastolic') %>%
# add_trace(y = ~ibp_mean,name='ibp_mean') %>%
#
# layout(
# title = "Non-Invasive Blood Pressure in eICU in the first 24h of ICU admission",
# xaxis = x, yaxis = y
# )
#
#
# exercise2Exercise 2 final result
Temp <- df_vitals_eicu[!is.na(df_vitals_eicu$nursingchartvalue) & df_vitals_eicu$nursingchartcelltypevalname %like% 'Temperature',]
x <- list(
title = "Time from ICU admission (hours)",
showticklabels = TRUE,
showgrid = FALSE
)
y <- list(
title = "Value (?C)",
side = "left",
showticklabels = TRUE
)
temp <- plot_ly(df_vitals_eicu) %>%
add_markers(x = ~chartoffset/60
,y = ~temperature
,color = ~temperaturelocation
,type = 'scatter', mode = 'markers', marker=list(size=15, opacity=0.6 )) %>%
layout(title = "Temperature for a patient in the first 24h of ICU admission", xaxis = x, yaxis = y, showlegend = T, legend = list(x = 1.1, y = 0.9))
tempNumber of samples per vital sign to analyze missing data.
distribution<-as.data.frame(colSums(!is.na(df_vitals_eicu)))
distribution<-rename(distribution, 'complete'='colSums(!is.na(df_vitals_eicu))')
distribution['missing']<-colSums(is.na(df_vitals_eicu))
missingplot <- plot_ly(distribution, x = row.names(distribution), y = ~complete, type = 'bar', name = 'Complete values', marker=list(color='#2980b9')) %>%
add_trace(y = ~missing, name = 'Missing Values', marker=list(color='#2c3e50')) %>%
layout(title = 'No. of measurements per variable for a patient in the first 24h of ICU admission'
, barmode = 'stack'
, yaxis =list(title='measurements'))
missingploteICU has different data available depending on the hospital, so we need to analise the frequency distribution hospital by hospital.
freqperhospital<-query_exec("
WITH
hosp_rank AS(
SELECT
-- this query is creating a ranking of hospitals based on the number of observations on nursecharting
hospitalid,
COUNT(*) AS observations
FROM
`physionet-data.eicu_crd.nursecharting` nursecharting
LEFT JOIN
`physionet-data.eicu_crd.patient` patient
ON
patient.patientunitstayid = nursecharting.patientunitstayid
GROUP BY
hospitalid
ORDER BY
observations DESC
LIMIT
10 ),
sq AS (
SELECT
patient.hospitalid,
nursingchartoffset - lag AS SamplingTime,
COUNT(nursingchartoffset - lag) AS frequency,
ROW_NUMBER() OVER (PARTITION BY hospitalid ORDER BY COUNT(nursingchartoffset - lag) DESC) AS position
FROM (
SELECT
patientunitstayid,
nursingchartoffset,
LAG(nursingchartoffset) OVER (PARTITION BY patientunitstayid ORDER BY patientunitstayid ASC, nursingchartoffset ASC) AS lag
FROM
`physionet-data.eicu_crd.nursecharting` nursecharting
WHERE
nursingchartcelltypevallabel = 'Heart Rate'
AND nursingchartoffset <= 1440 --This is the time window for the upper cut point.
ORDER BY
patientunitstayid,
nursingchartoffset ) AS sq
LEFT JOIN
`physionet-data.eicu_crd.patient` patient
ON
patient.patientunitstayid = sq.patientunitstayid
GROUP BY
hospitalid,
SamplingTime
ORDER BY
hospitalid,
frequency DESC )
SELECT
sq.hospitalid,
SamplingTime,
frequency
FROM
sq
INNER JOIN
-- this join is going to get only hospitals in the top 10
hosp_rank
ON
sq.hospitalid = hosp_rank.hospitalid
WHERE
position <=3 --we only want the top 3 frequencies per hospital
"
,project = project_HST ,use_legacy_sql = F)
freqperhospital## hospitalid SamplingTime frequency
## 1 73 15 151241
## 2 73 60 55934
## 3 73 30 46265
## 4 400 15 106503
## 5 400 5 10079
## 6 400 30 7881
## 7 199 60 55355
## 8 199 15 22956
## 9 199 120 9769
## 10 264 15 81496
## 11 264 60 41198
## 12 264 30 34792
## 13 243 15 50103
## 14 243 60 39283
## 15 243 30 13555
## 16 252 60 57165
## 17 252 15 14078
## 18 252 5 9534
## 19 338 15 157525
## 20 338 30 25141
## 21 338 60 23347
## 22 167 15 97299
## 23 167 60 44636
## 24 167 30 17333
## 25 420 60 76344
## 26 420 15 9837
## 27 420 30 8946
## 28 443 60 65228
## 29 443 15 7713
## 30 443 30 3952
Uncomment and modify the time window from the following query so it extracts data within the first 6 hours prior to ICU admission and 72 hours after it.
# exercise3<-query_exec("
# WITH
# hosp_rank AS(
# SELECT
# -- this query is creating a ranking of hospitals based on the number of observations on nursecharting
# hospitalid,
# COUNT(*) AS observations
# FROM
# `physionet-data.eicu_crd.nursecharting` nursecharting
# LEFT JOIN
# `physionet-data.eicu_crd.patient` patient
# ON
# patient.patientunitstayid = nursecharting.patientunitstayid
# GROUP BY
# hospitalid
# ORDER BY
# observations DESC
# LIMIT
# 10 ),
# sq AS (
# SELECT
# patient.hospitalid,
# nursingchartoffset - lag AS SamplingTime,
# COUNT(nursingchartoffset - lag) AS frequency,
# ROW_NUMBER() OVER (PARTITION BY hospitalid ORDER BY COUNT(nursingchartoffset - lag) DESC) AS position
# FROM (
# SELECT
# patientunitstayid,
# nursingchartoffset,
# LAG(nursingchartoffset) OVER (PARTITION BY patientunitstayid ORDER BY patientunitstayid ASC, nursingchartoffset ASC) AS lag
# FROM
# `physionet-data.eicu_crd.nursecharting` nursecharting
# WHERE
# nursingchartcelltypevallabel = 'Heart Rate'
# AND nursingchartoffset <= 1440 --This is the time window for the upper cut point.
# ORDER BY
# patientunitstayid,
# nursingchartoffset ) AS sq
# LEFT JOIN
# `physionet-data.eicu_crd.patient` patient
# ON
# patient.patientunitstayid = sq.patientunitstayid
# GROUP BY
# hospitalid,
# SamplingTime
# ORDER BY
# hospitalid,
# frequency DESC )
# SELECT
# sq.hospitalid,
# SamplingTime,
# frequency
# FROM
# sq
# INNER JOIN
# -- this join is going to get only hospitals in the top 10
# hosp_rank
# ON
# sq.hospitalid = hosp_rank.hospitalid
# WHERE
# position <=3 --we only want the top 3 frequencies per hospital
#
# "
# ,project = project_HST ,use_legacy_sql = F)
#
# exercise3blues <- brewer.pal(9, "Blues")
BrBG <- brewer.pal(11, "BrBG")
hosp_data<-dcast(freqperhospital ,hospitalid + frequency~SamplingTime )
hosp_data$frequency<-NULL
freq <- plot_ly(hosp_data, x = ~as.factor(hospitalid), y = hosp_data$`5`, type = 'bar', name = '5 min' , marker=list(color=BrBG[8])) %>%
add_trace(y = hosp_data$`15`, name = '15 min freq',marker=list(color=blues[8])) %>%
add_trace(y = hosp_data$`30`, name = '30 min freq',marker=list(color=blues[5])) %>%
add_trace(y = hosp_data$`60`, name = '60 min freq',marker=list(color='#e84118')) %>%
add_trace(y = hosp_data$`120`, name = '120 min freq',marker=list(color='#e1b12c')) %>%
layout(title='Heart Rate SamplingTime in top 10 hospitals'
,yaxis = list(title = 'Count'), barmode = 'group'
,xaxis=list(title='hospital id')
)
freqa <- list(
title = "Value",
showticklabels = TRUE
)
b <- list(
showticklabels = FALSE
)
outliers <- plot_ly(data = df_vitals_eicu,y = ~heartrate, type = "box", name='Heart Rate [bpm]', marker = list(color = '#c0392b'), line = list(color = '#c0392b')) %>%
add_trace(y = ~respiratoryrate, name='respiratory rate ins/min', marker = list(color = '#7befb2'), line = list(color = 'rgb(107,174,214)'))%>%
add_trace(y = ~spo2,name='spo2', marker = list(color = '#e67e22'), line = list(color = '#e67e22')) %>%
add_trace(y = ~nibp_systolic, name='nibp_systolic mmHg', marker = list(color = '#fdcb6e'), line = list(color = '#fdcb6e')) %>%
add_trace(y = ~nibp_diastolic, name='nibp_diastolic mmHg', marker = list(color = '#6ab04c'), line = list(color = '#6ab04c')) %>%
add_trace(y = ~nibp_mean, name='nibp_mean mmHg', marker = list(color = '#22a6b3'), line = list(color = '#22a6b3')) %>%
add_trace(y = ~temperature, name='temperature mmHg', marker = list(color = '#8e44ad'), line = list(color = '#8e44ad')) %>%
layout(title = "Distribution of vitals for a patient in the first 24h of ICU admission", yaxis = a, xaxis = b, showlegend = TRUE)
outliersWe can visualize graphically how variables correlate between each other.
corr_data<-df_vitals_eicu %>%
select(heartrate,temperature,spo2)
corr_data<-corr_data[complete.cases(corr_data),]
cor_total_min<-cor(corr_data, use = "pairwise.complete.obs")
#We are looking for a sensible multiple imputation strategy to fill in the spaces so we use "pairwise.complete.obs".
# the method used by defauld is pearson
#non interactive corplot
corrplot(cor_total_min, tl.col = "black" )Uncomment and modify the following code so the correlation plot includes all non-invasive blood pressures (nibp_systolic,nibp_diastolic,nibp_mean) as well.
# exercise4<-df_vitals_eicu %>%
# select(heartrate,temperature,spo2)
#
# exercise4<-exercise4[complete.cases(exercise4),]
#
# cor_total_min<-cor(exercise4, use = "pairwise.complete.obs")
# #We are looking for a sensible multiple imputation strategy to fill in the spaces so we use "pairwise.complete.obs".
# # the method used by defauld is pearson
#
# #non interactive corplot
# corrplot(cor_total_min, tl.col = "black" )It may help to add a pre-ICU admission safety window because sometimes laboratory measurements are measured pre-ICU, e.g., look at all the labs measured from (-6 * 60) minutes to (24 * 60) minutes from ICU admission.
labsforapatient.eicu<-query_exec("
SELECT
ROUND(CAST (labresultoffset/60 AS FLOAT64),2) AS labresultoffset_hrs,
labname,
labresult
FROM
`physionet-data.eicu_crd.lab` lab
WHERE
patientunitstayid = 199499
AND labname NOT IN ('-bands'
,'-monos'
,'-lymphs'
,'-polys')
AND CAST (labresultoffset/60 AS FLOAT64) <= 24
ORDER BY
labresultoffset_hrs",use_legacy_sql = F, project = project_HST )
labsforapatient.eicu## labresultoffset_hrs labname labresult
## 1 0.13 lactate 2.80
## 2 0.13 calcium 8.20
## 3 0.13 platelets x 1000 69.00
## 4 0.13 ionized calcium 4.44
## 5 0.13 MCV 101.20
## 6 0.13 Hct 25.20
## 7 0.13 MCH 32.10
## 8 0.13 MCHC 31.70
## 9 0.13 BUN 35.00
## 10 0.13 total protein 4.70
## 11 0.13 ALT (SGPT) 40.00
## 12 0.13 Hgb 8.00
## 13 0.13 glucose 83.00
## 14 0.13 potassium 3.60
## 15 0.13 sodium 136.00
## 16 0.13 albumin 2.70
## 17 0.13 phosphate 2.30
## 18 0.13 chloride 98.00
## 19 0.13 magnesium 1.60
## 20 0.13 creatinine 1.73
## 21 0.13 RDW 26.70
## 22 0.13 bicarbonate 28.00
## 23 0.13 AST (SGOT) 27.00
## 24 0.13 RBC 2.49
## 25 0.13 alkaline phos. 241.00
## 26 0.13 WBC x 1000 13.40
## 27 0.13 anion gap 14.00
## 28 0.13 total bilirubin 5.70
## 29 8.32 bedside glucose 91.00
## 30 12.08 RDW 27.60
## 31 12.08 alkaline phos. 245.00
## 32 12.08 total bilirubin 5.60
## 33 12.08 total protein 4.90
## 34 12.08 MCH 31.70
## 35 12.08 WBC x 1000 16.10
## 36 12.08 MCV 101.50
## 37 12.08 calcium 8.80
## 38 12.08 magnesium 1.80
## 39 12.08 potassium 4.00
## 40 12.08 RBC 2.71
## 41 12.08 MCHC 31.30
## 42 12.08 bicarbonate 27.00
## 43 12.08 Hgb 8.60
## 44 12.08 platelets x 1000 97.00
## 45 12.08 Hct 27.50
## 46 12.08 sodium 132.00
## 47 12.08 glucose 98.00
## 48 12.08 creatinine 2.00
## 49 12.08 BUN 45.00
## 50 12.08 AST (SGOT) 30.00
## 51 12.08 chloride 94.00
## 52 12.08 anion gap 15.00
## 53 12.08 lactate 1.20
## 54 12.08 albumin 2.70
## 55 12.08 phosphate 3.20
## 56 12.08 ALT (SGPT) 41.00
## 57 12.33 PTT 51.00
## 58 14.42 bedside glucose 100.00
## 59 21.25 bedside glucose 123.00
We can visualize the dispersion and range of values throughout time for all lab results.
labs <- labsforapatient.eicu %>% plot_ly(x=labsforapatient.eicu$labresultoffset_hrs
,y=labsforapatient.eicu$labresult
,color=labsforapatient.eicu$labname
,marker=list(size=12)) %>%
layout(title='Lab results for a patient over time in the first 24 hours of ICU admission'
,yaxis = list(title = 'Value')
,xaxis=list(title='Time from ICU admission (hours)')
)
labsUncomment and modify the following R code so it plots only Hemoglobin and Glucose for the first 24 hours of ICU admission.
# exercise5 <- labsforapatient.eicu %>% plot_ly(x=labsforapatient.eicu$labresultoffset_hrs
# ,y=labsforapatient.eicu$labresult
# ,color=labsforapatient.eicu$labname
# ,marker=list(size=12)) %>%
# layout(title='Exercise 5'
# ,yaxis = list(title = 'Value')
# ,xaxis=list(title='Time from ICU admission (hours)')
# )
# exercise5Exercise 5 final result
As is common in eICU-CRD, there are a subset of hospitals who routinely utilize a given portion of the medical record, while there are other hospitals who rarely use a given interface and thus have poor data completion.
labsforapatient.eicu<-query_exec("
with sq as (SELECT hospitalid
,lab.patientunitstayid
,CASE
WHEN labname = 'bedside glucose' THEN count(labresult)
END AS samplesperpatient_glucose
,CASE
WHEN labname = 'Hgb' THEN count(labresult)
END AS samplesperpatient_hgb
FROM `physionet-data.eicu_crd.lab` lab
INNER JOIN `physionet-data.eicu_crd.patient` patient
ON
lab.patientunitstayid = patient.patientunitstayid
WHERE
labname = 'bedside glucose'
OR
labname = 'Hgb'
GROUP BY hospitalid,patientunitstayid,labname
ORDER BY
hospitalid
)
SELECT
hospitalid hospitalid
,ROUND(AVG(samplesperpatient_glucose),2) average_glucose
,ROUND(AVG(samplesperpatient_hgb),2) average_hgb
FROM sq
GROUP BY hospitalid
ORDER BY average_glucose,average_hgb DESC
LIMIT 50
"
,use_legacy_sql = F, project = project_HST )
labsforapatient.eicu## hospitalid average_glucose average_hgb
## 1 212 NA 33.84
## 2 452 NA 11.60
## 3 350 NA 11.05
## 4 243 NA 6.11
## 5 244 NA 6.06
## 6 323 NA 6.00
## 7 204 NA 5.76
## 8 445 NA 5.64
## 9 423 NA 5.13
## 10 355 NA 4.88
## 11 425 NA 4.82
## 12 413 NA 4.74
## 13 201 NA 4.66
## 14 422 NA 4.43
## 15 246 NA 4.26
## 16 245 NA 4.03
## 17 412 NA 3.90
## 18 263 NA 3.32
## 19 267 NA 2.20
## 20 361 NA 1.88
## 21 123 NA 1.08
## 22 120 NA 1.00
## 23 112 NA 1.00
## 24 133 NA 0.98
## 25 385 1.00 1.67
## 26 138 1.00 1.24
## 27 91 5.00 3.50
## 28 84 6.52 3.00
## 29 156 7.54 2.42
## 30 411 8.67 5.58
## 31 351 8.80 4.69
## 32 93 9.16 3.34
## 33 179 9.16 2.45
## 34 69 9.42 3.62
## 35 61 9.78 3.27
## 36 342 9.93 2.93
## 37 59 10.12 3.72
## 38 174 10.36 2.80
## 39 164 10.80 3.96
## 40 265 10.90 3.00
## 41 151 11.00 2.00
## 42 424 11.13 5.20
## 43 279 11.43 4.85
## 44 158 11.92 3.27
## 45 135 12.00 2.55
## 46 205 12.13 2.84
## 47 250 12.51 3.40
## 48 328 12.86 3.06
## 49 282 12.88 4.54
## 50 182 12.99 4.29
labsforapatient.eicu<-labsforapatient.eicu[complete.cases(labsforapatient.eicu),]
paired <- brewer.pal(10, "Paired")
hosp_data$frequency<-NULL
hgb_gluc_perhosp <- plot_ly(labsforapatient.eicu, x = ~as.factor(hospitalid), y = ~average_glucose, type = 'bar', name = 'avg glucose' , marker=list(color=paired[2])) %>%
add_trace(y = ~average_hgb, name = 'avg hgb',marker=list(color=paired[6])) %>%
layout(title='Hgb and glucose mean observations per hospital'
,yaxis = list(title = 'Count'), barmode = 'group'
,xaxis=list(title='hospital id')
)
hgb_gluc_perhospchartevents contains all the charted data available for a patient. During their ICU stay, the primary repository of a patient’s information is their electronic chart. The electronic chart displays patients’ routine vital signs and any additional information relevant to their care: ventilator settings, laboratory values, code status, mental status, and so on. As a result, the bulk of information about a patient’s stay is contained in CHARTEVENTS. Furthermore, even though laboratory values are captured elsewhere (LABEVENTS), they are frequently repeated within CHARTEVENTS. https://github.com/MIT-LCP/mimic-code/blob/master/tutorials/explore-items.Rmd
pivoted_vital: The script pivoted-vital.sql includes a preprocessed, pivoted version of nurse charting table of vital signs: The script pivoted-vital.sql includes a preprocessed, pivoted version of vital signs: https://github.com/MIT-LCP/mimic-code/tree/master/concepts/pivot
labevents data contains information regarding laboratory based measurements. The process for acquiring a lab measurement is as follows: first, a member of the clinical staff acquires a fluid from a site in the patient’s body (e.g. blood from an arterial line, urine from a catheter, etc). Next, the fluid is bar coded to associate it with the patient and timestamped to record the time of the fluid acquisition. The lab analyses the data and returns a result within 4-12 hours.
patientunitstayid <-245727
sql<-paste("SELECT distinct subject_id, hadm_id, icustay_id, max(heartrate) as heartrate, max(sysbp) as sysbp, max(diasbp) as diasbp, max(meanbp) as meanbp, max(resprate) as resprate, max(temperature) as temperature, max(spo2) as spo2, charttime
FROM (
SELECT distinct pvt.subject_id, pvt.hadm_id, pvt.icustay_id, pvt.charttime
, (case when VitalID = 1 then valuenum else null end) as HeartRate
, (case when VitalID = 2 then valuenum else null end) as SysBP
, (case when VitalID = 3 then valuenum else null end) as DiasBP
, (case when VitalID = 4 then valuenum else null end) as MeanBP
, (case when VitalID = 5 then valuenum else null end) as RespRate
, (case when VitalID = 6 then valuenum else null end) as Temperature
, (case when VitalID = 7 then valuenum else null end) as SpO2
FROM (
select distinct ie.subject_id, ie.hadm_id, ie.icustay_id, charttime
, case
when itemid in (211,220045) then 1 -- HeartRate
when itemid in (51,442,455,6701,220179,220050) then 2 -- SysBP
when itemid in (8368,8440,8441,8555,220180,220051) then 3 -- DiasBP
when itemid in (456,52,6702,443,220052,220181,225312) then 4 -- MeanBP
when itemid in (223761,223762,676,678) then 6 -- Temp
when itemid in (646,220277) then 7 -- SpO2
else null end as VitalID
, valuenum
from `physionet-data.mimiciii_clinical.icustays` ie
left join `physionet-data.mimiciii_clinical.chartevents` ce
on ie.subject_id = ce.subject_id and ie.hadm_id = ce.hadm_id and ie.icustay_id = ce.icustay_id
and ce.charttime between ie.intime and DATETIME_ADD(ie.intime, INTERVAL 24 hour)
where ce.itemid in
(
211,220045, --'Heart Rate'
51, 442, 455, 6701, 220179, 220050, 8368, 8440, 8441, 8555, 220180, 220051, -- Systolic/diastolic
456,52, 6702, 443, 220052, 220181, 225312, -- MEAN ARTERIAL PRESSURE
618, 615, 220210, 224690, -- RESPIRATORY RATE
646, 220277, -- SPO2, peripheral
223762, 676,223761, 678 -- TEMPERATURE
)
) pvt
where pvt.icustay_id =",patientunitstayid,") group by subject_id, hadm_id, icustay_id, charttime ORDER BY charttime")
df_vitals<-query_exec(sql, project = project_HST ,use_legacy_sql = FALSE)First, we need to visualize the data and understand if it makes sense. Mean blood pressure is calculated based on systolic and diastolic blood pressures, and we can visualize it in the plot.
SysBP <- df_vitals$sysbp
MeanBP <- df_vitals$meanbp
DiasBP <- df_vitals$diasbp
x <- c(1:length(df_vitals$sysbp))
a <- list(
title = "Measurement",
showticklabels = TRUE
)
b <- list(
title = "Value [mmHg]",
showticklabels = TRUE
)
bp_all <- plot_ly(df_vitals, x = ~x, y = ~SysBP, name = 'SysBP', type = 'scatter', mode = 'markers', marker=list(size=12, opacity=0.6)) %>%
add_trace(y = ~MeanBP, name = 'MeanBP', mode = 'markers', marker=list(size=12 , opacity=0.6, color = 'purple')) %>%
add_trace(y = ~DiasBP, name = 'DiasBP', mode = 'markers', marker = list(size=12 , opacity=0.6, color = '#FF9999')) %>%
layout(title = "Blood pressure for a patient in the first 24h of ICU admission", xaxis = a, yaxis = b, showlegend = TRUE)
bp_allWhen plotting temperature, we observe that the units are in Celsius and Fahrenheit degrees. To further analyse the data, we should then convert temperature into a single unit, such as celsius degrees.
Temp <- df_vitals$temperature[!is.na(df_vitals$temperature)]
x <- c(1:length(Temp))
a <- list(
title = "Measurement",
showticklabels = TRUE,
showgrid = FALSE
)
b <- list(
title = "Value [?F]",
side = "left",
showticklabels = TRUE
)
ay <- list(
overlaying = "y",
side = "right",
title = "Value [?C]",
showticklabels = TRUE,
showgrid = FALSE
)
temp_cf <- plot_ly() %>%
add_markers(x = ~c(1:length(Temp[Temp<50])), y = ~Temp[Temp<50], yaxis = "y2", name = "?C", type = 'scatter', mode = 'markers', marker=list(size=15, opacity=0.6, color = '#ffb142')) %>%
add_markers (x = ~c(1:length(Temp[Temp>50])), y = ~Temp[Temp>50], name = "?F", type = 'scatter', mode = 'markers', marker=list(size=15, opacity=0.8, color = 'rgb(42, 187, 155, 1)')) %>%
layout(title = "Temperature for a patient in the first 24h of ICU admission", xaxis = a, yaxis = b, yaxis2 = ay, showlegend = T, legend = list(x = 1.1, y = 0.9))
temp_cf Number of samples per vital sign to analyse missing data.
We observe that this specific patient has no samples for Respiratory Rate. However, before excluding this variable from a study due to the amount of missing data, we should analyse the number of samples for the cohort of all patients. Only then, should we make the decision to exclude the variable or not.
SysBP <- length(df_vitals$sysbp[!is.na(df_vitals$sysbp)])
MeanBP <- length(df_vitals$meanbp[!is.na(df_vitals$meanbp)])
DiasBP <- length(df_vitals$diasbp[!is.na(df_vitals$diasbp)])
HeartRate <- length(df_vitals$heartrate[!is.na(df_vitals$heartrate)])
RespRate <- length(df_vitals$resprate[!is.na(df_vitals$resprate)])
Temperature <- length(df_vitals$temperature[!is.na(df_vitals$temperature)])
SpO2 <- length(df_vitals$spo2[!is.na(df_vitals$spo2)])
Vars <- c("MeanBP","SysBP","DiasBP","HeartRate","SpO2","Temperature","RespRate")
Count <- c(MeanBP, SysBP, DiasBP, HeartRate, SpO2, Temperature, RespRate)
data <- data.frame(Vars, Count, stringsAsFactors = FALSE)
data$Vars <- factor(data$Vars, levels = unique(data$Vars)[order(data$Count, decreasing = TRUE)])
a <- list(title = ' ')
plot_ly(data, x = ~Vars, y = ~Count, type = "bar", marker = list(opacity=0.6))%>%
layout(title = 'No. of measurements per variable for a patient in the first 24h of ICU admission', xaxis = a)sql<-paste("SELECT TIMESTAMP_DIFF(CAST(charttime AS TIMESTAMP), CAST(lag AS TIMESTAMP), MINUTE) AS SamplingTime, COUNT(TIMESTAMP_DIFF(CAST(charttime AS TIMESTAMP), CAST(lag AS TIMESTAMP), MINUTE)) AS frequency
FROM
(
SELECT charttime, Lag(charttime) over (PARTITION BY ie.icustay_id ORDER BY ie.icustay_id ASC, ce.charttime ASC) AS lag
FROM `physionet-data.mimiciii_clinical.chartevents` ce
left join `physionet-data.mimiciii_clinical.icustays` ie
on ie.subject_id = ce.subject_id and ie.hadm_id = ce.hadm_id and ie.icustay_id = ce.icustay_id
and ce.charttime between ie.intime and DATETIME_ADD(ie.intime, INTERVAL 24 hour)
where ce.itemid in (211,220045))
GROUP BY SamplingTime ORDER BY frequency DESC limit 11")
df_freqHR<-query_exec(sql, project = project_HST,use_legacy_sql = FALSE)Frequency of data acquisition for top 10 sampling times of Heart Rate.
Heart rate is the variable with higher amount of collected samples, comparing to the other vital signs. However, we still have a significant amount of missing data for this variable, which can be observed in the plot for a null sampling time. We conclude that this variable is mostly hourly collected.
a <- list(
title = "Sampling time [min]",
showticklabels = TRUE
)
b <- list(
title = "# samples",
showticklabels = TRUE
)
colors <- c('rgba(222,45,38,0.8)','#0072B2', '#0072B2', '#0072B2', '#0072B2', '#0072B2', '#0072B2', '#0072B2', '#0072B2', '#0072B2')
radius <- df_freqHR$frequency[1:10]/40000
t <- list(
x = ~min(df_freqHR$SamplingTime[1:10]),
y = max(df_freqHR$frequency[1:10]),
text = 'Missing data',
xref = "x",
yref = "y",
showarrow = TRUE,
arrowhead = 7,
ax = 80,
ay = -40
)
hr_top_null <- plot_ly(df_freqHR, x = ~df_freqHR$SamplingTime[1:10], y = ~df_freqHR$frequency[1:10], type = 'scatter', mode = 'markers', marker = list(line = list(width = 0), size = radius, opacity = 0.5, color = colors))%>%
layout(annotations = t, title = 'No. samples for top 10 sampling times of data acquisition for Heart Rate', xaxis = a, yaxis = b, xaxis = list(showgrid = FALSE), yaxis = list(showgrid = FALSE))
hr_top_nullPlot frequency of data acquisition for top 10 sampling times of Heart Rate, excluding frequency of null measurements.
Heart rate is mostly collected hourly or every 1 minute for the overall patients.
a <- list(
title = "Sampling time [min]",
showticklabels = TRUE
)
b <- list(
title = "# samples",
showticklabels = TRUE
)
colors <- c('rgba(222,45,38,0.8)', '#0072B2', '#0072B2', '#0072B2', '#0072B2', '#0072B2', '#0072B2', '#0072B2', '#0072B2', '#0072B2')
radius <- df_freqHR$frequency/30000
t <- list(
family = "sans serif",
size = 44,
color = "white",
x = 60,
y = max(df_freqHR$frequency[2:length(df_freqHR$SamplingTime)]),
text = '1 hour sampling',
showarrow = TRUE,
arrowhead = 7,
ax = 10,
ay = -40
)
hr_top <- plot_ly(df_freqHR, x = ~df_freqHR$SamplingTime[2:length(df_freqHR$SamplingTime)], y = ~df_freqHR$frequency[2:length(df_freqHR$frequency)], type = 'scatter', mode = 'markers', marker = list(size = radius, opacity = 0.5, color = colors))%>%
layout(annotations =t, title = 'No. samples for top 10 sampling times of data acquisition for Heart Rate', xaxis = a, yaxis = b, xaxis = list(showgrid = FALSE), yaxis = list(showgrid = FALSE))
hr_top## Frequency for the vital signs
## SysBP
sql<-paste("SELECT TIMESTAMP_DIFF(CAST(charttime AS TIMESTAMP), CAST(lag AS TIMESTAMP), MINUTE) AS SamplingTime, COUNT(TIMESTAMP_DIFF(CAST(charttime AS TIMESTAMP), CAST(lag AS TIMESTAMP), MINUTE)) AS frequency
FROM
(
SELECT charttime, Lag(charttime) over (PARTITION BY ie.icustay_id ORDER BY ie.icustay_id ASC, ce.charttime ASC) AS lag
FROM `physionet-data.mimiciii_clinical.chartevents` ce
left join `physionet-data.mimiciii_clinical.icustays` ie
on ie.subject_id = ce.subject_id and ie.hadm_id = ce.hadm_id and ie.icustay_id = ce.icustay_id
and ce.charttime between ie.intime and DATETIME_ADD(ie.intime, INTERVAL 24 hour)
where ce.itemid in (51,442,455,6701,220179,220050))
GROUP BY SamplingTime ORDER BY frequency DESC limit 11")
df_freqSBP<-query_exec(sql, project = project_HST,use_legacy_sql = FALSE)
## DiasBP
sql<-paste("SELECT TIMESTAMP_DIFF(CAST(charttime AS TIMESTAMP), CAST(lag AS TIMESTAMP), MINUTE) AS SamplingTime, COUNT(TIMESTAMP_DIFF(CAST(charttime AS TIMESTAMP), CAST(lag AS TIMESTAMP), MINUTE)) AS frequency
FROM
(
SELECT charttime, Lag(charttime) over (PARTITION BY ie.icustay_id ORDER BY ie.icustay_id ASC, ce.charttime ASC) AS lag
FROM `physionet-data.mimiciii_clinical.chartevents` ce
left join `physionet-data.mimiciii_clinical.icustays` ie
on ie.subject_id = ce.subject_id and ie.hadm_id = ce.hadm_id and ie.icustay_id = ce.icustay_id
and ce.charttime between ie.intime and DATETIME_ADD(ie.intime, INTERVAL 24 hour)
where ce.itemid in (8368,8440,8441,8555,220180,220051))
GROUP BY SamplingTime ORDER BY frequency DESC limit 11")
df_freqDBP<-query_exec(sql, project = project_HST,use_legacy_sql = FALSE)
## MeanBP
sql<-paste("SELECT TIMESTAMP_DIFF(CAST(charttime AS TIMESTAMP), CAST(lag AS TIMESTAMP), MINUTE) AS SamplingTime, COUNT(TIMESTAMP_DIFF(CAST(charttime AS TIMESTAMP), CAST(lag AS TIMESTAMP), MINUTE)) AS frequency
FROM
(
SELECT charttime, Lag(charttime) over (PARTITION BY ie.icustay_id ORDER BY ie.icustay_id ASC, ce.charttime ASC) AS lag
FROM `physionet-data.mimiciii_clinical.chartevents` ce
left join `physionet-data.mimiciii_clinical.icustays` ie
on ie.subject_id = ce.subject_id and ie.hadm_id = ce.hadm_id and ie.icustay_id = ce.icustay_id
and ce.charttime between ie.intime and DATETIME_ADD(ie.intime, INTERVAL 24 hour)
where ce.itemid in (456,52,6702,443,220052,220181,225312))
GROUP BY SamplingTime ORDER BY frequency DESC limit 11")
df_freqMBP<-query_exec(sql, project = project_HST,use_legacy_sql = FALSE)
## Temp
sql<-paste("SELECT TIMESTAMP_DIFF(CAST(charttime AS TIMESTAMP), CAST(lag AS TIMESTAMP), MINUTE) AS SamplingTime, COUNT(TIMESTAMP_DIFF(CAST(charttime AS TIMESTAMP), CAST(lag AS TIMESTAMP), MINUTE)) AS frequency
FROM
(
SELECT charttime, Lag(charttime) over (PARTITION BY ie.icustay_id ORDER BY ie.icustay_id ASC, ce.charttime ASC) AS lag
FROM `physionet-data.mimiciii_clinical.chartevents` ce
left join `physionet-data.mimiciii_clinical.icustays` ie
on ie.subject_id = ce.subject_id and ie.hadm_id = ce.hadm_id and ie.icustay_id = ce.icustay_id
and ce.charttime between ie.intime and DATETIME_ADD(ie.intime, INTERVAL 24 hour)
where ce.itemid in (223761,223762,676,678))
GROUP BY SamplingTime ORDER BY frequency DESC limit 11")
df_freqTemp<-query_exec(sql, project = project_HST,use_legacy_sql = FALSE)
## SpO2
sql<-paste("SELECT TIMESTAMP_DIFF(CAST(charttime AS TIMESTAMP), CAST(lag AS TIMESTAMP), MINUTE) AS SamplingTime, COUNT(TIMESTAMP_DIFF(CAST(charttime AS TIMESTAMP), CAST(lag AS TIMESTAMP), MINUTE)) AS frequency
FROM
(
SELECT charttime, Lag(charttime) over (PARTITION BY ie.icustay_id ORDER BY ie.icustay_id ASC, ce.charttime ASC) AS lag
FROM `physionet-data.mimiciii_clinical.chartevents` ce
left join `physionet-data.mimiciii_clinical.icustays` ie
on ie.subject_id = ce.subject_id and ie.hadm_id = ce.hadm_id and ie.icustay_id = ce.icustay_id
and ce.charttime between ie.intime and DATETIME_ADD(ie.intime, INTERVAL 24 hour)
where ce.itemid in (646,220277))
GROUP BY SamplingTime ORDER BY frequency DESC limit 11")
df_freqSp<-query_exec(sql, project = project_HST,use_legacy_sql = FALSE)
## RR
sql<-paste("SELECT TIMESTAMP_DIFF(CAST(charttime AS TIMESTAMP), CAST(lag AS TIMESTAMP), MINUTE) AS SamplingTime, COUNT(TIMESTAMP_DIFF(CAST(charttime AS TIMESTAMP), CAST(lag AS TIMESTAMP), MINUTE)) AS frequency
FROM
(
SELECT charttime, Lag(charttime) over (PARTITION BY ie.icustay_id ORDER BY ie.icustay_id ASC, ce.charttime ASC) AS lag
FROM `physionet-data.mimiciii_clinical.chartevents` ce
left join `physionet-data.mimiciii_clinical.icustays` ie
on ie.subject_id = ce.subject_id and ie.hadm_id = ce.hadm_id and ie.icustay_id = ce.icustay_id
and ce.charttime between ie.intime and DATETIME_ADD(ie.intime, INTERVAL 24 hour)
where ce.itemid in (615,618,220210,224690))
GROUP BY SamplingTime ORDER BY frequency DESC limit 11")
df_freqRR<-query_exec(sql, project = project_HST,use_legacy_sql = FALSE)All vital signs, excluding null frequency of measurements.
We observe that all the variables are mostly hourly collected and that Temperature is the one with the least number of samples.
## [1] '4.9.0'
a <- list(
title = "Sampling time [min]",
showticklabels = TRUE
)
b <- list(
title = "# samples",
showticklabels = TRUE
)
HR <- df_freqHR$frequency[2:11]
Sp <- df_freqSp$frequency[2:11]
Temp <- df_freqTemp$frequency[2:11]
MBP <- df_freqMBP$frequency[2:11]
DBP <- df_freqDBP$frequency[2:11]
SBP <- df_freqSBP$frequency[2:11]
RR <- df_freqRR$frequency[2:11]
samplingTime <- unique(c(df_freqHR$SamplingTime[2:11],df_freqRR$SamplingTime[2:11],df_freqSp$SamplingTime[2:11],df_freqTemp$SamplingTime[2:11], df_freqMBP$SamplingTime[2:11],df_freqDBP$SamplingTime[2:11],df_freqSBP$SamplingTime[2:11]))
radius <- df_freqHR$frequency[2:11]/20000
freq_all <- plot_ly(df_freqHR, x = ~df_freqHR$SamplingTime[2:11], y = ~HR, name = 'Heart Rate [bpm]', type = 'scatter', mode = 'markers', marker=list(size=radius, opacity=0.6)) %>%
add_trace(x = ~df_freqTemp$SamplingTime[2:11], y = ~Temp, name = 'Temp [?C]', mode = 'markers', marker=list(line = list(width = 0), size=radius, opacity=0.6, color = '#4AC6B7')) %>%
add_trace(x = ~df_freqRR$SamplingTime[2:11], y = ~RR, name = 'RespRate [bpm]', mode = 'markers', marker=list(line = list(width = 0), size=radius, opacity=0.6, color = 'blue')) %>%
add_trace(x = ~df_freqSp$SamplingTime[2:11], y = ~Sp, name = 'SpO2 [%]', mode = 'markers', marker=list(line = list(width = 0), size=radius, opacity=0.6, color = 'purple')) %>%
add_trace(x = ~df_freqMBP$SamplingTime[2:11], y = ~MBP, name = 'MeanBP [mmHg]', mode = 'markers', marker=list(line = list(width = 0), size=radius, opacity=0.6, color = '#965F8A')) %>%
add_trace(x = ~df_freqDBP$SamplingTime[2:11], y = ~DBP, name = 'DiasBP [mmHg]', mode = 'markers', marker=list(line = list(width = 0), size=radius, opacity=0.6, color = '#FF7070')) %>%
add_trace(x = ~df_freqSBP$SamplingTime[2:11], y = ~SBP, name = 'SysBP [mmHg]', mode = 'markers', marker=list(line = list(width = 0), size=radius, opacity=0.6, color = '#fdcb6e')) %>%
layout(title = "No. samples for top 10 sampling times of vitals in the first 24h of ICU admission", xaxis = a, yaxis = b, showlegend = TRUE)
freq_allExcept for Heart Rate, all vitals exhibit at least one value outside of the box, which we can denominate an outlier. For temperature, the value closer to 100 degrees is in Fahrenheit, therefore we should convert it to Celsius degrees. For the remaining outliers, we should assess if these are possible values for a patient or if they should be removed.
a <- list(
title = "Value",
showticklabels = TRUE
)
b <- list(
showticklabels = FALSE
)
outliers <- plot_ly(y = ~df_vitals$heartrate, type = "box", name='Heart Rate [bpm]', marker = list(color = '#c0392b'), line = list(color = '#c0392b')) %>%
add_trace(y = ~df_vitals$spo2, name='SpO2 [%]', marker = list(color = 'rgb(107,174,214)'), line = list(color = 'rgb(107,174,214)')) %>%
add_trace(y = ~df_vitals$sysbp,name='SysBP [mmHg]', marker = list(color = '#e67e22'), line = list(color = '#e67e22')) %>%
add_trace(y = ~df_vitals$diasbp, name='DiasBP [mmHg]', marker = list(color = '#fdcb6e'), line = list(color = '#fdcb6e')) %>%
add_trace(y = ~df_vitals$meanbp, name='MeanBP [mmHg]', marker = list(color = '#6ab04c'), line = list(color = '#6ab04c')) %>%
add_trace(y = ~df_vitals$temperature, name='Temp [?C, ?F]', marker = list(color = '#22a6b3'), line = list(color = '#22a6b3')) %>%
layout(title = "Distribution of vitals for a patient in the first 24h of ICU admission", yaxis = a, xaxis = b, showlegend = TRUE)
outliersWe can visualize graphically how variables correlate between each other.
cat <- c("sysbp", "meanbp", "diasbp", "heartrate", "temperature", "spo2")
dummydata<-df_vitals %>%
select(sysbp,meanbp,diasbp,heartrate,temperature,spo2)
cor(dummydata[complete.cases(dummydata),])## sysbp meanbp diasbp heartrate temperature spo2
## sysbp 1.0000000 0.8136145 0.6859170 0.7321013 0.2554570 0.3477659
## meanbp 0.8136145 1.0000000 0.9566656 0.3743161 0.6675277 0.3961054
## diasbp 0.6859170 0.9566656 1.0000000 0.2238832 0.7985887 0.2853291
## heartrate 0.7321013 0.3743161 0.2238832 1.0000000 -0.1366428 0.2805464
## temperature 0.2554570 0.6675277 0.7985887 -0.1366428 1.0000000 0.1382169
## spo2 0.3477659 0.3961054 0.2853291 0.2805464 0.1382169 1.0000000
e1<-dummydata[, names(dummydata) %in% (cat)]
e2 <- data.frame(lapply(e1, function(x) as.numeric(as.character(x))))
cor_total_min<-cor(e2, use = "pairwise.complete.obs")
#We are looking for a sensible multiple imputation strategy to fill in the spaces so we use = "pairwise.complete.obs".
#non interactive corplot
corrplot(cor_total_min, method = "color", tl.col = "black" )patientunitstayid <-245727
sql<-paste("SELECT distinct subject_id, hadm_id, icustay_id, max(GLUCOSE) as Glucose, max(HEMOGLOBIN) as hemoglobin, charttime FROM (
SELECT pvt.subject_id, pvt.hadm_id, pvt.icustay_id, pvt.charttime
, (CASE WHEN label = 'GLUCOSE' THEN valuenum ELSE null END) as GLUCOSE
, (CASE WHEN label = 'HEMOGLOBIN' THEN valuenum ELSE null END) as HEMOGLOBIN
FROM
( -- begin query that extracts the data
SELECT ie.subject_id, ie.hadm_id, ie.icustay_id, charttime
-- here we assign labels to ITEMIDs
-- this also fuses together multiple ITEMIDs containing the same data
, CASE
WHEN itemid in (50809,50931) THEN 'GLUCOSE'
WHEN itemid in (50811,51222) THEN 'HEMOGLOBIN'
ELSE null
END AS label,
valuenum
FROM `physionet-data.mimiciii_clinical.icustays` ie
LEFT JOIN `physionet-data.mimiciii_clinical.labevents` le
ON le.subject_id = ie.subject_id AND le.hadm_id = ie.hadm_id
-- the last 6 hours might have nurses notes with information
AND le.charttime BETWEEN DATETIME_ADD(ie.intime, INTERVAL -6 hour) AND DATETIME_ADD(ie.intime, INTERVAL 24 hour)
AND le.ITEMID in
(
50931, 50809, -- GLUCOSE
51222,50811 -- HEMOGLOBIN
)
AND valuenum IS NOT null AND valuenum > 0 -- lab values cannot be 0 and cannot be negative
) pvt
where pvt.icustay_id =",patientunitstayid,"ORDER BY pvt.charttime) group by subject_id, hadm_id, icustay_id, charttime")
df_labs<-query_exec(sql, project = project_HST,use_legacy_sql = FALSE) We can visualize the dispersion and range of values throughout time for glucose and hemoglobin.
glucose <- df_labs$Glucose
hemoglobin <- df_labs$hemoglobin
a <- list(
title = "Measurement",
showticklabels = TRUE,
showgrid = FALSE
)
b <- list(
title = "Hemoglobine [g/dL]",
side = "left",
showticklabels = TRUE
)
ay <- list(
overlaying = "y",
side = "right",
title = "Glucose [mg/dL]",
showticklabels = TRUE,
showgrid = FALSE
)
glucose_hgb <- plot_ly() %>%
add_markers(x = ~c(1:length(glucose)), y = ~glucose, yaxis = "y2", name = "Glucose", type = 'scatter', mode = 'markers', marker=list(size=15, opacity=0.6, color = '#fab1a0')) %>%
add_markers (x = ~c(1:length(hemoglobin)), y = ~hemoglobin, name = "Hemoglobin", type = 'scatter', mode = 'markers', marker=list(size=15, opacity=0.6, color = 'rgb(37, 116, 169, 1)')) %>%
layout(title = "Laboratory results for a patient in the first 24h of ICU admission", xaxis = a, yaxis = b, yaxis2 = ay, showlegend = T, legend = list(x = 1.1, y = 0.9))
glucose_hgbsql<-paste("SELECT TIMESTAMP_DIFF(CAST(charttime AS TIMESTAMP), CAST(lag AS TIMESTAMP), MINUTE) AS SamplingTime, COUNT(TIMESTAMP_DIFF(CAST(charttime AS TIMESTAMP), CAST(lag AS TIMESTAMP), MINUTE)) AS frequency
FROM
(
SELECT charttime, Lag(charttime) over (PARTITION BY ie.icustay_id ORDER BY ie.icustay_id ASC, le.charttime ASC) AS lag
FROM `physionet-data.mimiciii_clinical.labevents` le
left join `physionet-data.mimiciii_clinical.icustays` ie
on ie.subject_id = le.subject_id and ie.hadm_id = le.hadm_id
and le.charttime between ie.intime and DATETIME_ADD(ie.intime, INTERVAL 24 hour)
where le.itemid in (50931, 50809))
GROUP BY SamplingTime ORDER BY frequency DESC limit 11")
df_freqGlu<-query_exec(sql, project = project_HST,use_legacy_sql = FALSE)
sql<-paste("SELECT TIMESTAMP_DIFF(CAST(charttime AS TIMESTAMP), CAST(lag AS TIMESTAMP), MINUTE) AS SamplingTime, COUNT(TIMESTAMP_DIFF(CAST(charttime AS TIMESTAMP), CAST(lag AS TIMESTAMP), MINUTE)) AS frequency
FROM
(
SELECT charttime, Lag(charttime) over (PARTITION BY ie.icustay_id ORDER BY ie.icustay_id ASC, le.charttime ASC) AS lag
FROM `physionet-data.mimiciii_clinical.labevents` le
left join `physionet-data.mimiciii_clinical.icustays` ie
on ie.subject_id = le.subject_id and ie.hadm_id = le.hadm_id
and le.charttime between ie.intime and DATETIME_ADD(ie.intime, INTERVAL 24 hour)
where le.itemid in (51222,50811))
GROUP BY SamplingTime ORDER BY frequency DESC limit 11")
df_freqHemog<-query_exec(sql, project = project_HST,use_legacy_sql = FALSE)Frequency of data acquisition for top 10 sampling times of Hemoglobin and Glucose
Hemoglobin and glucose are mostly sampled at a 5 minute interval.
a <- list(
title = "Sampling time [min]",
showticklabels = TRUE
)
b <- list(
title = "# samples",
showticklabels = TRUE
)
hemoglobin<-df_freqHemog[!(df_freqHemog$SamplingTime==0),]
glucose<-df_freqGlu[!(df_freqGlu$SamplingTime==0),]
samplingTime <- unique(c(hemoglobin$SamplingTime,glucose$SamplingTime))
glu <- glucose$frequency
hemog <- hemoglobin$frequency
radius <- hemoglobin$frequency/1000
lab_top <- plot_ly(glucose, x = ~glucose$SamplingTime, y = ~glu, name = 'Glucose [mg/dL]', type = 'scatter', mode = 'markers', marker=list(size=radius, opacity=0.6)) %>%
add_trace(x = ~hemoglobin$SamplingTime, y = ~hemog, name = 'Hemoglobine [g/dL]', mode = 'markers', marker=list(line = list(width = 0), size=radius, opacity=0.6, color = '#4AC6B7')) %>%
layout(title = "No. samples for top 10 sampling times for lab results in the first 24h of ICU admission", xaxis = a, yaxis = b, showlegend = TRUE)
lab_topsql<-paste("SELECT subject_id, COUNT(GLUCOSE) as total FROM (
SELECT pvt.subject_id, pvt.hadm_id, pvt.icustay_id, pvt.charttime
, (CASE WHEN label = 'GLUCOSE' THEN valuenum ELSE null END) as GLUCOSE
FROM
(SELECT ie.subject_id, ie.hadm_id, ie.icustay_id, charttime,
CASE WHEN itemid in (50809,50931) THEN 'GLUCOSE' ELSE null END AS label, valuenum
FROM `physionet-data.mimiciii_clinical.icustays` ie
LEFT JOIN `physionet-data.mimiciii_clinical.labevents` le
ON le.subject_id = ie.subject_id AND le.hadm_id = ie.hadm_id
-- the last 6 hours might have nurses notes with information
AND le.charttime BETWEEN DATETIME_ADD(ie.intime, INTERVAL -6 hour) AND DATETIME_ADD(ie.intime, INTERVAL 24 hour)
AND le.ITEMID in (50931, 50809) AND valuenum IS NOT null AND valuenum > 0) pvt) group by subject_id order by total DESC")
df_glu<-query_exec(sql, project = project_HST,use_legacy_sql = FALSE)
sql<-paste("SELECT subject_id, COUNT(HEMOGLOBIN) as total FROM (
SELECT pvt.subject_id, pvt.hadm_id, pvt.icustay_id, pvt.charttime
, (CASE WHEN label = 'HEMOGLOBIN' THEN valuenum ELSE null END) as HEMOGLOBIN
FROM
(SELECT ie.subject_id, ie.hadm_id, ie.icustay_id, charttime,
CASE WHEN itemid in (51222,50811) THEN 'HEMOGLOBIN' ELSE null END AS label, valuenum
FROM `physionet-data.mimiciii_clinical.icustays` ie
LEFT JOIN `physionet-data.mimiciii_clinical.labevents` le
ON le.subject_id = ie.subject_id AND le.hadm_id = ie.hadm_id
-- the last 6 hours might have nurses notes with information
AND le.charttime BETWEEN DATETIME_ADD(ie.intime, INTERVAL -6 hour) AND DATETIME_ADD(ie.intime, INTERVAL 24 hour)
AND le.ITEMID in (51222,50811) AND valuenum IS NOT null AND valuenum > 0) pvt) group by subject_id order by total Desc")
df_hemog<-query_exec(sql, project = project_HST,use_legacy_sql = FALSE) Number of samples of Glucose and Hemoglobin per patient in the first 24 hours of ICU admission
When plotting the number of samples per patient, we can assess if all or most of the patients have information of this variable and how many samples. We observe that most patients have less than 20 samples in the first 24 hours of ICU admission (less than 1 sample per hour) for both laboratory variables.
library(plotly)
a <- list(
title = "# patients",
showticklabels = TRUE
)
b <- list(
title = "# samples",
showticklabels = TRUE,
range = c(-1, 10.5)
)
lab_samples <- plot_ly(x = df_glu$total, name = 'Glucose [mg/dL]', type = 'histogram', marker=list(line = list( color = 'blue'), color = 'blue')) %>%
add_trace(x = df_hemog$total, name = 'Hemoglobin [g/dL]', type = 'histogram', marker=list(line = list( color = '#0abde3'), opacity=0.6, color = '#0abde3')) %>%
layout(title = "No. samples per patient in the first 24 hours of ICU admission", xaxis = b, yaxis = a, showlegend = TRUE, barmode = 'group', bargap = 0.15, bargroupgap = 0.1)
lab_samples